── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.1 ✔ tibble 3.2.1
✔ lubridate 1.9.4 ✔ tidyr 1.3.1
✔ purrr 1.0.4
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
── Attaching packages ────────────────────────────────────── tidymodels 1.3.0 ──
✔ broom 1.0.7 ✔ rsample 1.2.1
✔ dials 1.4.0 ✔ tune 1.3.0
✔ infer 1.0.7 ✔ workflows 1.2.0
✔ modeldata 1.4.0 ✔ workflowsets 1.1.0
✔ parsnip 1.3.1 ✔ yardstick 1.3.2
✔ recipes 1.1.1
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ scales::discard() masks purrr::discard()
✖ dplyr::filter() masks stats::filter()
✖ recipes::fixed() masks stringr::fixed()
✖ dplyr::lag() masks stats::lag()
✖ yardstick::spec() masks readr::spec()
✖ recipes::step() masks stats::step()
library (powerjoin)
library (glue)
library (vip)
Attaching package: 'vip'
The following object is masked from 'package:utils':
vi
library (baguette)
library (ggplot2)
library (patchwork)
root <- 'https://gdex.ucar.edu/dataset/camels/file'
download.file ('https://gdex.ucar.edu/dataset/camels/file/camels_attributes_v2.0.pdf' ,
'data/camels_attributes_v2.0.pdf' )
types <- c ("clim" , "geol" , "soil" , "topo" , "vege" , "hydro" )
remote_files <- glue ('{root}/camels_{types}.txt' )
local_files <- glue ('data/camels_{types}.txt' )
walk2 (remote_files, local_files, download.file, quiet = TRUE )
camels <- map (local_files, read_delim, show_col_types = FALSE )
camels <- power_full_join (camels, by = 'gauge_id' )
Question 1
Zero_q_freq means frequency of days with Q (dishcharge)= 0 mm per day.
ggplot (data = camels, aes (x = gauge_lon, y = gauge_lat)) +
borders ("state" , colour = "gray50" ) +
geom_point (aes (color = q_mean)) + scale_color_gradient (low = "pink" , high = "dodgerblue" ) +
ggthemes:: theme_map ()
Question 2
map_aridity <- ggplot (data = camels, aes (x = gauge_lon, y = gauge_lat)) +
borders ("state" , colour = "gray50" ) +
geom_point (aes (color = aridity)) + scale_color_gradient (low = "lightblue" , high = "darkred" ) +
labs (title = "Aridity Data from USGS sites in the US" ,
color = "Aridity" ) +
ggthemes:: theme_map ()
print (map_aridity)
map_p_mean <- ggplot (data = camels, aes (x = gauge_lon, y = gauge_lat)) +
borders ("state" , colour = "gray50" ) +
geom_point (aes (color = p_mean)) + scale_color_gradient (low = "lightblue" , high = "darkblue" ) +
labs (title = "Mean Precipitation from USGS sites in the US" ,
color = "Mean Precipitation" ) +
ggthemes:: theme_map ()
print (map_p_mean)
final_maps <- map_aridity + map_p_mean + plot_layout (ncol = 2 )
print (final_maps)
ggsave ("final_maps.png" , plot = final_maps, width = 10 , height = 6 , dpi = 300 )
camels %>%
select (aridity, p_mean, q_mean) %>%
drop_na () %>%
cor ()
aridity p_mean q_mean
aridity 1.0000000 -0.7550090 -0.5817771
p_mean -0.7550090 1.0000000 0.8865757
q_mean -0.5817771 0.8865757 1.0000000
ggplot (camels, aes (x = aridity, y = p_mean)) +
geom_point (aes (color = q_mean)) +
geom_smooth (method = "lm" , color = "red" , linetype = 2 ) +
scale_color_viridis_c () +
theme_linedraw () +
theme (legend.position = "bottom" ) +
labs (title = "Aridity vs Rainfall vs Runnoff" ,
x = "Aridity" ,
y = "Rainfall" ,
color = "Mean Flow" )
`geom_smooth()` using formula = 'y ~ x'
ggplot (camels, aes (x = aridity, y = p_mean)) +
geom_point (aes (color = q_mean)) +
geom_smooth (method = "lm" ) +
scale_color_viridis_c () +
scale_x_log10 () +
scale_y_log10 () +
theme_linedraw () +
theme (legend.position = "bottom" ) +
labs (title = "Aridity vs Rainfall vs Runoff" ,
x = "Aridity" ,
y = "Rainfall" ,
color = "Mean Flow" )
`geom_smooth()` using formula = 'y ~ x'
ggplot (camels, aes (x = aridity, y = p_mean)) +
geom_point (aes (color = q_mean)) +
geom_smooth (method = "lm" ) +
scale_color_viridis_c (trans = "log" ) +
scale_x_log10 () +
scale_y_log10 () +
theme_linedraw () +
theme (legend.position = "bottom" ,
legend.key.width = unit (2.5 , "cm" ),
legend.key.height = unit (.5 , "cm" )) +
labs (title = "Aridity vs Rainfall vs Runnoff" ,
x = "Aridity" ,
y = "Rainfall" ,
color = "Mean Flow" )
`geom_smooth()` using formula = 'y ~ x'
set.seed (123 )
camels <- camels %>%
mutate (logQmean = log (q_mean))
camels_split <- initial_split (camels, prop = 0.8 )
camels_train <- training (camels_split)
camels_test <- testing (camels_split)
camels_cv <- vfold_cv (camels_train, v = 10 )
rec <- recipe (logQmean ~ aridity + p_mean, data = camels_train) %>%
step_log (all_predictors ()) %>%
step_interact (terms = ~ aridity: p_mean) %>%
step_naomit (all_predictors (), all_outcomes ())
baked_data <- prep (rec, camels_train) %>%
bake (new_data = NULL )
lm_base <- lm (logQmean ~ aridity * p_mean, data = baked_data)
summary (lm_base)
Call:
lm(formula = logQmean ~ aridity * p_mean, data = baked_data)
Residuals:
Min 1Q Median 3Q Max
-2.91162 -0.21601 -0.00716 0.21230 2.85706
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.77586 0.16365 -10.852 < 2e-16 ***
aridity -0.88397 0.16145 -5.475 6.75e-08 ***
p_mean 1.48438 0.15511 9.570 < 2e-16 ***
aridity:p_mean 0.10484 0.07198 1.457 0.146
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.5696 on 531 degrees of freedom
Multiple R-squared: 0.7697, Adjusted R-squared: 0.7684
F-statistic: 591.6 on 3 and 531 DF, p-value: < 2.2e-16
summary (lm (logQmean ~ aridity + p_mean + aridity_x_p_mean, data = baked_data))
Call:
lm(formula = logQmean ~ aridity + p_mean + aridity_x_p_mean,
data = baked_data)
Residuals:
Min 1Q Median 3Q Max
-2.91162 -0.21601 -0.00716 0.21230 2.85706
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.77586 0.16365 -10.852 < 2e-16 ***
aridity -0.88397 0.16145 -5.475 6.75e-08 ***
p_mean 1.48438 0.15511 9.570 < 2e-16 ***
aridity_x_p_mean 0.10484 0.07198 1.457 0.146
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.5696 on 531 degrees of freedom
Multiple R-squared: 0.7697, Adjusted R-squared: 0.7684
F-statistic: 591.6 on 3 and 531 DF, p-value: < 2.2e-16
test_data <- bake (prep (rec), new_data = camels_test)
test_data$ lm_pred <- predict (lm_base, newdata = test_data)
metrics (test_data, truth = logQmean, estimate = lm_pred)
# A tibble: 3 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 rmse standard 0.583
2 rsq standard 0.742
3 mae standard 0.390
ggplot (test_data, aes (x = logQmean, y = lm_pred, colour = aridity)) +
scale_color_gradient2 (low = "brown" , mid = "orange" , high = "darkgreen" ) +
geom_point () +
geom_abline (linetype = 2 ) +
theme_linedraw () +
labs (title = "Linear Model: Observed vs Predicted" ,
x = "Observes Log Mean Flow" ,
y = "Predicted Log Mean Flow" ,
color = "Aridity" )
lm_model <- linear_reg () %>%
set_engine ("lm" ) %>%
set_mode ("regression" )
lm_wf <- workflow () %>%
add_recipe (rec) %>%
add_model (lm_model) %>%
fit (data = camels_train)
summary (extract_fit_engine (lm_wf))$ coefficients
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.7758557 0.16364755 -10.851710 6.463654e-25
aridity -0.8839738 0.16144589 -5.475357 6.745512e-08
p_mean 1.4843771 0.15511117 9.569762 4.022500e-20
aridity_x_p_mean 0.1048449 0.07198145 1.456555 1.458304e-01
summary (lm_base)$ coefficients
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.7758557 0.16364755 -10.851710 6.463654e-25
aridity -0.8839738 0.16144589 -5.475357 6.745512e-08
p_mean 1.4843771 0.15511117 9.569762 4.022500e-20
aridity:p_mean 0.1048449 0.07198145 1.456555 1.458304e-01
lm_data <- augment (lm_wf, new_data = camels_test)
dim (lm_data)
metrics (lm_data, truth = logQmean, estimate = .pred)
# A tibble: 3 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 rmse standard 0.583
2 rsq standard 0.742
3 mae standard 0.390
ggplot (lm_data, aes (x = logQmean, y = .pred, colour = aridity)) +
scale_color_viridis_c () +
geom_point () +
geom_abline () +
theme_linedraw ()
library (baguette)
rf_model <- rand_forest () %>%
set_engine ("ranger" , importance = "impurity" ) %>%
set_mode ("regression" )
rf_wf <- workflow () %>%
add_recipe (rec) %>%
add_model (rf_model) %>%
fit (data = camels_train)
rf_data <- augment (rf_wf, new_data = camels_test)
dim (rf_data)
metrics (rf_data, truth = logQmean, estimate = .pred)
# A tibble: 3 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 rmse standard 0.587
2 rsq standard 0.741
3 mae standard 0.363
ggplot (rf_data, aes (x = logQmean, y = .pred, colour = aridity)) +
scale_color_viridis_c () +
geom_point () +
geom_abline () +
theme_linedraw ()
wf <- workflow_set (list (rec), list (lm_model, rf_model)) %>%
workflow_map ('fit_resamples' , resamples = camels_cv)
autoplot (wf)
rank_results (wf, rank_metric = "rsq" , select_best = TRUE )
# A tibble: 4 × 9
wflow_id .config .metric mean std_err n preprocessor model rank
<chr> <chr> <chr> <dbl> <dbl> <int> <chr> <chr> <int>
1 recipe_rand_fore… Prepro… rmse 0.563 0.0247 10 recipe rand… 1
2 recipe_rand_fore… Prepro… rsq 0.771 0.0259 10 recipe rand… 1
3 recipe_linear_reg Prepro… rmse 0.569 0.0260 10 recipe line… 2
4 recipe_linear_reg Prepro… rsq 0.770 0.0223 10 recipe line… 2
Question 3
xgb_model <- boost_tree () %>%
set_engine ("xgboost" ) %>%
set_mode ("regression" )
nn_model <- bag_mlp () %>%
set_engine ("nnet" ) %>%
set_mode ("regression" )
wf <- workflow_set (
list (rec),
list (
lm_model = lm_model,
rf_model = rf_model,
xgb_model = xgb_model,
nn_model = nn_model
)
) %>%
workflow_map ("fit_resamples" , resamples = camels_cv)
rank_results (wf, rank_metric = "rsq" , select_best = TRUE )
# A tibble: 8 × 9
wflow_id .config .metric mean std_err n preprocessor model rank
<chr> <chr> <chr> <dbl> <dbl> <int> <chr> <chr> <int>
1 recipe_nn_model Preproc… rmse 0.547 0.0308 10 recipe bag_… 1
2 recipe_nn_model Preproc… rsq 0.787 0.0266 10 recipe bag_… 1
3 recipe_rf_model Preproc… rmse 0.565 0.0247 10 recipe rand… 2
4 recipe_rf_model Preproc… rsq 0.770 0.0258 10 recipe rand… 2
5 recipe_lm_model Preproc… rmse 0.569 0.0260 10 recipe line… 3
6 recipe_lm_model Preproc… rsq 0.770 0.0223 10 recipe line… 3
7 recipe_xgb_model Preproc… rmse 0.600 0.0289 10 recipe boos… 4
8 recipe_xgb_model Preproc… rsq 0.745 0.0268 10 recipe boos… 4
# The random forest model was ranked the best, so I would move forward with this one. It was ranked as the model with the highest r-squared.
Question 4
set.seed (123 )
camels_split2 <- initial_split (camels, prop = 0.75 )
camels_train2 <- training (camels_split2)
camels_test2 <- testing (camels_split2)
camels_cv2 <- vfold_cv (camels_train2, v = 10 )
rec2 <- recipe (logQmean ~ aridity + p_mean, data = camels_train2) %>%
step_log (all_predictors ()) %>%
step_interact (terms = ~ aridity: p_mean) %>%
step_naomit (all_predictors (), all_outcomes ())
# I chose to use aridity and mean precipitation to predict logQmean because they would both have an impact on mean daily discharge, especially precipitation.
rf_model2 <- rand_forest () %>%
set_engine ("ranger" ) %>%
set_mode ("regression" )
lm_model2 <- linear_reg () %>%
set_engine ("lm" ) %>%
set_mode ("regression" )
nn_model2 <- bag_mlp () %>%
set_engine ("nnet" ) %>%
set_mode ("regression" )
wf2 <- workflow_set (list (rec2), list (rf_model2, lm_model2, nn_model2)) %>%
workflow_map ('fit_resamples' , resamples = camels_cv)
rank_results (wf2, rank_metric = "rsq" , select_best = TRUE )
# A tibble: 6 × 9
wflow_id .config .metric mean std_err n preprocessor model rank
<chr> <chr> <chr> <dbl> <dbl> <int> <chr> <chr> <int>
1 recipe_linear_reg Prepro… rmse 0.569 0.0260 10 recipe line… 1
2 recipe_linear_reg Prepro… rsq 0.770 0.0223 10 recipe line… 1
3 recipe_rand_fore… Prepro… rmse 0.565 0.0245 10 recipe rand… 2
4 recipe_rand_fore… Prepro… rsq 0.769 0.0257 10 recipe rand… 2
5 recipe_bag_mlp Prepro… rmse 0.654 0.112 10 recipe bag_… 3
6 recipe_bag_mlp Prepro… rsq 0.745 0.0466 10 recipe bag_… 3
# I think the nn_model2 is the best because it has the lowest rmse and the highest r-squared. It is also ranked the best below.
nn_wf <- workflow () %>%
add_recipe (rec2) %>%
add_model (nn_model2) %>%
fit (data = camels_train2)
nn_data <- augment (nn_wf, new_data = camels_test2)
dim (nn_data)
ggplot (nn_data, aes (x = logQmean, y = .pred, colour = aridity)) +
scale_color_viridis_c () +
geom_point () +
geom_abline () +
theme_linedraw () +
labs (title = "Observed vs. Predicted Values" , x = "Log Mean Discharge" , y = "Predicted" , color = "Aridity" )
# I think the results look great as most points align, although there are some that deviate. I think this model overall did a great job of predicting the values.